Using Selenium and PhantomJS to crawl papers

本意是用scrapy去抓取文件的, 不过pdf下载部分一直搞不定, 只好先找到页面连接, 然后用selenium和phantomjs来模拟浏览器的行为, 采用点击,另存为的方法
不得不说, 标准的论文数据库还是对下载做了许多限制的, 比如我点击论文的pdf地址, 后退回原网页,再重新点击pdf地址,也会给出警告,然后转移到请求登录的界面。
这里给出一个简单的单页抓取模板, 但是这种方法效率并不高,SSRN数据库加载的很慢。

1. 基本下载模块

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
#!/usr/bin/env python
# encoding: utf-8
"""Download one paper PDF from SSRN by driving a headless PhantomJS browser.

Opens the abstract page, clicks the download link, and either follows the
anonymous-download path on SSRN's "Data Integrity Notice" page or saves the
PDF directly with urllib.
"""
__author__ = 'dm'

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import requests
from bs4 import BeautifulSoup
import time
import urllib.request
import urllib.parse

driver = webdriver.PhantomJS()
driver.set_window_size(1120, 550)
paper_url = "http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2702516"
# driver.get("http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2703212")
flag = 1  # 1 = download attempted, 0 = paper is behind a paywall
try:
    driver.get(paper_url)
    content = driver.page_source
    try:
        download_button = driver.find_element_by_id("openDownloadLink1")
        time.sleep(2)  # give the slow SSRN page a moment before clicking
        download_button.click()
        if "Data_Integrity_Notice" in driver.current_url:
            # SSRN interposed its warning page: pick the anonymous-download
            # tab and confirm to proceed without logging in.
            driver.find_element_by_css_selector("#AnonymousTab > a.deselected > nobr").click()
            driver.find_element_by_name("ProcessAnym").click()
        else:
            # Name the file after the abstract_id query parameter instead of
            # slicing at a hard-coded offset (original: paper_url[51:]),
            # which silently breaks if the URL prefix ever changes.
            query = urllib.parse.urlparse(paper_url).query
            abstract_id = urllib.parse.parse_qs(query)["abstract_id"][0]
            pdf_name = abstract_id + ".pdf"
            urllib.request.urlretrieve(driver.current_url, pdf_name)
    except NoSuchElementException:
        # No download link on the page: the paper requires purchase/login.
        print("need buy it")
        flag = 0
finally:
    driver.quit()  # always release the PhantomJS process, even on errors

2. 测试脚本

用selenium直接导出了一个测试脚本, 非常方便, 推荐大家使用。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import unittest, time, re


class Test1(unittest.TestCase):
    """Selenium-exported test: drive Chrome through SSRN's anonymous
    download flow for one abstract page."""

    def setUp(self):
        self.driver = webdriver.Chrome()
        self.driver.implicitly_wait(30)
        self.base_url = "http://papers.ssrn.com/"
        self.verificationErrors = []
        self.accept_next_alert = True

    def test_1(self):
        driver = self.driver
        driver.get(self.base_url + "/sol3/papers.cfm?abstract_id=2697209&download=yes")
        driver.find_element_by_id("openDownloadLink1").click()
        driver.find_element_by_css_selector("#AnonymousTab > a.deselected > nobr").click()
        driver.find_element_by_name("ProcessAnym").click()

    def is_element_present(self, how, what):
        """Return True if an element locatable by (how, what) exists."""
        try:
            self.driver.find_element(by=how, value=what)
        except NoSuchElementException:
            return False
        return True

    def is_alert_present(self):
        """Return True if a JavaScript alert is currently open."""
        try:
            # switch_to.alert is a property; merely accessing it raises
            # NoAlertPresentException when no alert is open.  The original
            # exported code called it like a function (switch_to.alert()),
            # which raises TypeError instead of the exception caught below.
            self.driver.switch_to.alert
        except NoAlertPresentException:
            return False
        return True

    def close_alert_and_get_its_text(self):
        """Accept or dismiss the current alert and return its text."""
        try:
            alert = self.driver.switch_to.alert
            alert_text = alert.text
            if self.accept_next_alert:
                alert.accept()
            else:
                alert.dismiss()
            return alert_text
        finally:
            # One-shot flag: always reset to "accept" for the next alert.
            self.accept_next_alert = True

    def tearDown(self):
        self.driver.quit()
        self.assertEqual([], self.verificationErrors)


if __name__ == "__main__":
    unittest.main()

3. PhantomJS以及firefox设置代理

需要在配置文件中加入以下内容

3.1 PhantomJS

1
2
3
4
5
# PhantomJS command-line switches: send all traffic through a local SOCKS5 proxy.
proxy_switches = ['--proxy=127.0.0.1:1080', '--proxy-type=socks5']
driver = webdriver.PhantomJS(service_args=proxy_switches)

3.2 Firefox

1
2
3
4
5
6
7
8
# Configure a Firefox profile to route traffic through a SOCKS proxy.
profile = webdriver.FirefoxProfile()
profile.set_preference('network.proxy.type', 1)  # 0 = direct (default); 1 = manual proxy config
profile.set_preference('network.proxy.socks', proxyip)  # proxy host, e.g. "127.0.0.1" for a local proxy
profile.set_preference('network.proxy.socks_port', port)  # proxy port, user-defined
# The original referenced an undefined name `ip` here; the SSL proxy should
# point at the same host as the SOCKS proxy above.
profile.set_preference('network.proxy.ssl', proxyip)
profile.set_preference('network.proxy.ssl_port', port)
profile.update_preferences()
browser = webdriver.Firefox(profile)